Tidy Tuesday

I came accross the following youtube channel where David Robinson does analyses on different datasets. I saw the following video where he performed an analysis on NYC squirrels and thought I would give it a shot of my own!

Libraries

The following libraries will be used in this analysis. If it is your first time using these packages, also use the ‘install.packages’ function.

#install.packages("rlang")
library(rlang)
#install.packages("ggmap")
library(ggmap)
#install.packages("gam")
library(gam)
#install.packages("gridExtra")
library(gridExtra)





Read in Data

nyc_squirrels <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-29/nyc_squirrels.csv")

# Convert to Data Frame
sq_dat <- as.data.frame(nyc_squirrels)





Prep Data

Remove Variables

# The following plots are how I new to drop a few of the fields
# Using some quick and very ugly plots, we can quickly see that some of these fields are useless at the moment.
#    In the event that the squirrel dataset is added to using sightings from different locations across NYC, this may change
par(mfrow=c(2,3),bg = '#222222')
for(var in c("community_districts","borough_boundaries","city_council_districts","police_precincts","zip_codes")){
  plot_dist <- na.omit(unique(sq_dat[,c("lat","long",var)]))
  plot(plot_dist$long,plot_dist$lat,col=c("black","blue","red","purple","green")[as.factor(plot_dist[,var])],pch=15,xlab="Longitude",ylab="Latitude",axis="",adj=0,main=var)
}
par(mfrow=c(1,1))

Add Variables

data <- sq_dat[,!(colnames(sq_dat) %in% c(
"above_ground_sighter_measurement",
"combination_of_primary_and_highlight_color",
"unique_squirrel_id",
"lat_long",
"zip_codes",
"community_districts",
"borough_boundaries",
"city_council_districts",
"police_precincts"    
))]
# Needed to overlay plots on maps
data[,"lon"] <- data[,"long"]
data[,"adult"] <- ifelse(data[,"age"]=="Adult",TRUE,FALSE)
data[,"above_ground"] <- ifelse(data[,"location"]=="Above Ground",TRUE,FALSE)

data[,"primary_Gray"] <- ifelse(data[,"primary_fur_color"]=="Gray",TRUE,FALSE)
data[,"primary_Cinnamon"] <- ifelse(data[,"primary_fur_color"]=="Cinnamon",TRUE,FALSE)
data[,"primary_Black"] <- ifelse(data[,"primary_fur_color"]=="Black",TRUE,FALSE)
data[,"primary_Gray"]     <- ifelse(is.na(data[,"primary_fur_color"]),FALSE,TRUE)
data[,"primary_Cinnamon"] <- ifelse(is.na(data[,"primary_fur_color"]),FALSE,TRUE)
data[,"primary_Black"]    <- ifelse(is.na(data[,"primary_fur_color"]),FALSE,TRUE)

data[,"colors"] <- apply(data[,c("primary_fur_color","highlight_fur_color")],1,paste,collapse=",")


data[,"Gray"]     <- grepl("Gray",data[,"colors"])
data[,"Cinnamon"] <- grepl("Cinnamon",data[,"colors"])
data[,"Black"]    <- grepl("Black",data[,"colors"])
data[,"White"]    <- grepl("White",data[,"colors"])

data[,"UniqueColors"]    <- apply(data[,c("Gray","Cinnamon","Black","White")],1,sum)
data[,"MultiColor"]      <- ifelse(data[,"UniqueColors"]>1,TRUE,ifelse(data[,"UniqueColors"]==0,NA,FALSE))

# Convert hectare to grid x and y values
data[,"y"] <- as.numeric(substr(data[,"hectare"],1,2))
data[,"x"] <- apply(as.data.frame(substr(data[,"hectare"],3,3)),1,function(x){(1:9)[x==c("A","B","C","D","E","F","G","H","I")]})

Visualizations

Data Domain

# Plot showing domain of sightings
map <- get_map(location = c(min(data$long), min(data$lat), max(data$long), max(data$lat)), source="google", maptype="satellite", crop=FALSE)
dput(map, file = "myMaps")
map <- dget(file = "myMaps")

ggmap(map)+
  geom_point(data=data[,c("lon","lat")])+
  theme(plot.background = element_rect(fill = '#222222', colour = '#222222'),
        panel.background = element_rect(fill = '#222222', colour = '#222222'))

What is a hectare?

# This illustrates that hectare is just used to reorient the values to be rectangular instead of angled 
hect <- aggregate(data,by=list(data[,"hectare"]),mean)
n <- aggregate(rep(1,nrow(data)),by=list(data[,"hectare"]),sum)
hect <- cbind(n[,2],hect)
colnames(hect)[1] <- "n"
size <- 1+6*hect[,"n"]/max(hect[,"n"])

ggmap(map)+
  geom_point(data=hect[,c("lon","lat")],aes(colour=factor(hect[,"Group.1"])),size=size)+
  theme(legend.position="none",
        plot.background = element_rect(fill = '#222222', colour = '#222222'),
        panel.background = element_rect(fill = '#222222', colour = '#222222'))

When was this data collected?

# What date range was this data collected over?
date_to_text <- function(x){
  # Thank goodness this all happend in the same month and didnt happen both after the 9th and before the 10th!
  date <- paste(substring(x,5,9),substring(x,1,2),substring(x,3,4),sep="-")
  day <- weekdays(as.Date(date))
  #empties <- 10-as.data.frame(apply(as.data.frame(day),1,nchar))
  #print(min(empties))
  #apply(empties,1,rep," ")
  return(paste(date,day,sep="\n"))
}


data[,"date_chr"] <- apply(data[,"date",drop=FALSE],1,date_to_text)
head(data[,"date_chr"])
## [1] "2018-10-14\nSunday"    "2018-10-06\nSaturday"  "2018-10-10\nWednesday"
## [4] "2018-10-18\nThursday"  "2018-10-18\nThursday"  "2018-10-19\nFriday"
samples_by_date <- aggregate(data[,"date_chr"],by=list(data[,"date_chr"]),function(x){length(x)})

# Default is c(5, 4, 4, 2) + 0.1
# Give extra space to show full date
par(mar=c(5.5, 8, 4, 2) + 0.1,mfrow=c(1,1),bg = '#222222')
counts <- table(data[,"date_chr"],by=data[,"shift"])
barplot(t(counts), main="Sightings by Date - AM vs PM",
        xlab="Sightings" ,horiz=TRUE,las=2,cex.axis=1.25,cex.names=1.25,cex.lab=1.25,adj=0,cex.main=2,col = c("orange","#000055"))

Spatial Function

models <- list()
# List of all boolean variables
variables <- c("Gray","Black","White","Cinnamon","MultiColor","above_ground","adult","foraging","running","chasing","climbing","eating","kuks","quaas","moans","tail_flags","tail_twitches","approaches","indifferent","runs_from")

plot_var <- function(variable){
  # Predict variable using latitude and longitude
  form <-as.formula(paste(variable,"~s(lon,lat)",sep=""))
  fit <- mgcv::gam(formula=form,data=data[,c("lat","lon",variable)], family = binomial("logit"))
  models <- append(models,fit)
  pval <- paste(round(summary(fit)$s.pv,4)*100,"%",sep="")
  rsq <- paste(round(summary(fit)$r.sq,4)*100,"%",sep="")
  avg <- round(100*mean(na.omit(data[,variable])),digits = 1)
  
  # Get predictions for corresponding hectares
  preds <- predict(fit,newdata=hect,type="response")
  
  
  # Get predictions to create an overall heatmap
  miny <- min(data$lat)
  maxy <- max(data$lat)
  minx <- min(data$lon)
  maxx <- max(data$lon)
  
  
  
  grid <- expand.grid(seq(minx,maxx,(maxx-minx)/40),seq(miny,maxy,(maxy-miny)/60))
  colnames(grid) <- c("lon","lat")
  grid_preds <- predict(fit,newdata=grid,type="response")
  grid <- cbind(grid,grid_preds)
  
  
  
  
  
  #Plot Actual and predicted given latitude and longitude
  
    # size of grids are always at least 1, and up to 6. The largest sample was max(hect[,"n"]) = 32
    size <- 1+6*hect[,"n"]/max(hect[,"n"])
    # Average by grid
    raw_plot <- 
      ggmap(map)+
      theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank(), 
            axis.title.y=element_blank(),axis.text.y=element_blank(),axis.ticks.y=element_blank())+
      geom_point(data=hect[,c("lon","lat")],aes(colour=hect[,variable]),size=size) +
      scale_color_gradientn(colours = rainbow(5), limits=c(0,1),name=variable) 
      
    
    # Prediction by grid
    pred_plot <-
      ggmap(map)+
      theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank(), 
            axis.title.y=element_blank(),axis.text.y=element_blank(),axis.ticks.y=element_blank())+
      geom_point(data=hect[,c("lon","lat")],aes(colour=preds),size=size) +
      scale_color_gradientn(colours = rainbow(5), limits=c(0,1),breaks=c(round(min(preds),3),round(max(preds),3)),name="Predictions")+ 
      geom_text(data=hect[preds==max(preds),,drop=FALSE][1,],aes(lon,lat,label=paste(100*round(max(preds),3),"%")),size=5)+
      geom_text(data=hect[preds==min(preds),,drop=FALSE][1,],aes(lon,lat,label=paste(100*round(min(preds),3),"%")),size=5)

    # Prediction by grid (make it more like a heat map)
    pred_heatmap <-
      ggmap(map)+
      theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank(), 
            axis.title.y=element_blank(),axis.text.y=element_blank(),axis.ticks.y=element_blank())+
      geom_point(data=grid,aes(colour=grid_preds),size=5,shape=18,alpha=.4) +
      scale_color_gradientn(colours = rainbow(5), limits=c(0,1),breaks=c(round(min(preds),3),round(max(preds),3)),name="HeatMap")+ 
      geom_text(data=hect[preds==max(preds),,drop=FALSE][1,],aes(lon,lat,label=paste(100*round(max(preds),3),"%")),size=5)+
      geom_text(data=hect[preds==min(preds),,drop=FALSE][1,],aes(lon,lat,label=paste(100*round(min(preds),3),"%")),size=5) 
    
    grid.arrange(raw_plot, pred_plot, pred_heatmap, ncol = 1,top=paste(toupper(variable),": \nAverage Value: ",avg,"%\nR-Squared: ",rsq,"\nP-Value: ", pval,sep=""))
    par(mfrow=c(1,1))
}





Heatmaps

Gray

This variable descirbes if the squirrel had any grayness present, primary or otherwise.
par(bg = '#222222')
plot_var("Gray")

Black

This variable descirbes if the squirrel had any black present, primary or otherwise.

par(bg = '#222222')
plot_var("Black")

Signature